
import json
import pandas as pd
from collections import defaultdict

from src.config import train_file, test_file, stop_words_path, synonym_file_path


def load_stop_words():
    """Read the stop-word list from ``stop_words_path``.

    Each line of the file is treated as one stop word. Surrounding whitespace
    (including the trailing newline) is removed, and lines that are empty
    after stripping are skipped.

    Returns:
        list[str]: The stop words, in file order.
    """
    stop_words = []
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm whether an explicit encoding should be passed here.
    with open(stop_words_path) as fr:
        for line in fr:
            # Strip before testing: a raw blank line is "\n", which is truthy,
            # so checking the unstripped line would let "" in as a stop word.
            word = line.strip()
            if word:
                stop_words.append(word)
    return stop_words


def read_xlsx_file(file):
    """Load an Excel workbook and return its rows as a list.

    The workbook is parsed by pandas using the ``openpyxl`` engine with
    ``keep_default_na=False``, i.e. pandas' default set of NA marker strings
    is not applied while parsing.

    Args:
        file: Anything ``pandas.read_excel`` accepts as its first argument.

    Returns:
        list: One entry per row of ``DataFrame.values``, in sheet order.
    """
    frame = pd.read_excel(file, engine='openpyxl', keep_default_na=False)
    return list(frame.values)

def load_synonym():
    """Load synonym pairs from the workbook at ``synonym_file_path``.

    Returns:
        list[list]: One two-element list per row, holding the row's second
        column followed by its first column (the sheet's column order is
        swapped).
    """
    rows = read_xlsx_file(synonym_file_path)
    return [[row[1], row[0]] for row in rows]


def get_labels():
    """Return the distinct labels found in the first column of ``train_file``.

    Labels are de-duplicated while keeping the order in which they first
    appear in the file. Going through a ``set`` would give an order that can
    differ between interpreter runs for string labels (hash randomization),
    which makes any label-to-index mapping built from the result unstable.

    Returns:
        list: Unique labels in first-seen order.
    """
    rows = read_xlsx_file(train_file)
    # dict preserves insertion order, so this is an order-stable "unique".
    return list(dict.fromkeys(row[0] for row in rows))


def load_train_data():
    """Build ``(content, label)`` samples from the workbook at ``train_file``.

    For every row, ``content`` is the text of column 1 concatenated with the
    text of column 2, and ``label`` is the value of column 0.

    Returns:
        list[tuple]: One ``(content, label)`` tuple per row, in file order.
    """
    rows = read_xlsx_file(train_file)
    # Both columns go through str(): a cell holding a number would otherwise
    # make the concatenation raise TypeError. String cells are unaffected.
    return [(str(row[1]) + str(row[2]), row[0]) for row in rows]


def load_test_data():
    """Build ``(content, label)`` samples from the workbook at ``test_file``.

    For every row, ``content`` is the text of column 2 concatenated with the
    text of column 3, and ``label`` is the value of column 0.

    NOTE(review): the content columns here (2 and 3) differ from the ones
    used for the training file (1 and 2); presumably the test sheet has an
    extra column -- confirm against the actual file layout.

    Returns:
        list[tuple]: One ``(content, label)`` tuple per row, in file order.
    """
    rows = read_xlsx_file(test_file)
    # Both columns go through str(): a cell holding a number would otherwise
    # make the concatenation raise TypeError. String cells are unaffected.
    return [(str(row[2]) + str(row[3]), row[0]) for row in rows]

# Module-level synonym table: populated once, when this module is imported,
# by reading the synonym workbook via load_synonym().
synonyms = load_synonym()